#clean up
# Removes every object in the current environment (the global environment
# when the script is run at top level) before anything else happens.
# NOTE(review): this also deletes whatever the user had loaded before running
# the script; running in a fresh R session would be a safer way to get a
# clean workspace.
rm(list=ls())

#load packages
# foreign supplies read.dta(), used below to read the Stata data file.
library(foreign)

#set seed
# Fixes the RNG state so the sample() draws later in the script are
# reproducible from run to run.
set.seed(715130425)

#load raw data
# Reads the Stata file into the data frame `combined`, which is the pool the
# training subsets are later drawn from. The path is relative to the working
# directory.
combined<-read.dta('cces08reformattedFine48.dta') #Exclude AK & HI in fitting model
#plot(y=combined$northCent,x=combined$eastCent)
# Sanity checks: row counts per value of `state`, then the number of distinct
# state values. These only print when run interactively / at top level.
table(combined$state)
length(table(combined$state))

#recodes
# Build three combined religion columns as element-wise sums of existing
# columns:
#   cathOrth  = catholic + orthodox
#   consRelig = mormon   + evangelical
#   musJew    = islam    + jewish
# NOTE(review): `$` extraction partially matches data-frame column names, so
# these lines assume the source columns carry exactly these names -- confirm
# against the .dta file.
combined$cathOrth<-combined$catholic+combined$orthodox
combined$consRelig<-combined$mormon+combined$evangelical
combined$musJew<-combined$islam+combined$jewish

#load kriged points
# Reads the kriged points into the data frame `covariates`, which is the pool
# the forecasting (test) subsets are later drawn from. The path is relative to
# the working directory.
covariates<-read.csv('krigedPointsFine.csv')
#plot(y=covariates$northings,x=covariates$eastings,pch=".")
# Sanity checks: row counts per STATE value, the number of distinct STATE
# values, and the number of distinct TRACTA values. These only print when run
# interactively / at top level.
table(covariates$STATE)
length(table(covariates$STATE))
length(table(covariates$TRACTA))

#recodes
# Same three combined religion columns as built for `combined`, so both data
# frames share the column names cathOrth, consRelig and musJew:
#   cathOrth  = cath   + ortho
#   consRelig = mormon + evan
#   musJew    = islam  + jewish
# NOTE(review): the source names here (cath, ortho, evan) are shorter than the
# ones used for `combined`. `$` extraction partially matches column names, so
# these may be either the exact CSV column names or unique prefixes of longer
# ones -- confirm against the CSV header before replacing `$` with `[[`.
covariates$cathOrth<-covariates$cath+covariates$ortho
covariates$consRelig<-covariates$mormon+covariates$evan
covariates$musJew<-covariates$islam+covariates$jewish


###start loop #1: random sampling of kriges###
# For each replicate k = 1..50:
#   * draw a 5% random subset (without replacement) of the rows of `combined`
#     and write it to bootstrap/train<k>.csv;
#   * draw an independent 5% random subset (without replacement) of the rows
#     of `covariates` and write it to bootstrap/test<k>.csv.

# Make sure the output directory exists; write.csv() cannot create it and
# would otherwise fail only after the sampling work has been done.
dir.create('bootstrap', showWarnings = FALSE)

for (k in 1:50) {
	#reset collinearity checker
	coll.check <- TRUE

	#while loop to create suitable subset
	# A subset is accepted only if the check regression below is full rank.
	# lm() reports aliased (perfectly collinear) terms as NA coefficients, so
	# any NA means this draw is redrawn.
	while (coll.check) {
		#create subset of original data
		five.percent <- sample.int(nrow(combined), round(.05 * nrow(combined)))
		subset.data <- combined[five.percent, ]

		# The formula keeps the `subset.data$...` spelling on purpose: `$`
		# partially matches column names, so switching to bare names with
		# `data =` is only safe once the exact column names are confirmed.
		test.mod <- lm(subset.data$ideology~subset.data$age*subset.data$educ+as.factor(subset.data$race)*subset.data$female+
			subset.data$inc14+subset.data$cathOrth+subset.data$consRelig+subset.data$musJew+
			subset.data$mainline+subset.data$rural+subset.data$ownership+as.factor(subset.data$empstat)+
			subset.data$eastings*subset.data$northings+I(subset.data$eastings^2)+I(subset.data$northings^2))
		coll.check <- anyNA(coef(test.mod))
	}

	write.csv(subset.data, file.path('bootstrap', paste0('train', k, '.csv')), row.names = FALSE)
	rm(subset.data)

	#draw 5 percent of kriged points
	five.targets <- sample.int(nrow(covariates), round(.05 * nrow(covariates)))
	subset.covariates <- covariates[five.targets, ]
	write.csv(subset.covariates, file.path('bootstrap', paste0('test', k, '.csv')), row.names = FALSE)
	rm(subset.covariates)
}


###start loop #2: random sampling for training, but draw from all census tracts for forecasts###
#first split all census tracts into 50 lists randomly to cover all. Sample one point out of each tract.
# Result: `tract.sample`, a list of 50 integer vectors of row indices into
# `covariates`. Every tract contributes exactly one row, and every tract lands
# in exactly one of the 50 lists. The tract count is taken from the data
# instead of being hard-coded; with 23,764 tracts this gives 49 lists of 476
# rows and a final list of 440.
n.folds <- 50   # must equal the number of replicates in the loop that reads tract.sample

# Distinct tract identifiers, then a random permutation of them.
tract.list <- names(table(covariates$TRACTA))
n.tracts <- length(tract.list)
rand.tract.list <- tract.list[sample.int(n.tracts)]

# Row indices of `covariates` grouped by tract, built once instead of
# rescanning the whole TRACTA column for every tract.
rows.by.tract <- split(seq_len(nrow(covariates)), covariates$TRACTA)

# Pick one row per tract, in permuted tract order. Indexing with sample.int()
# avoids the sample() pitfall where a length-one numeric x is treated as 1:x:
# for a tract with a single kriged point, sample(row, 1) would return a random
# row from 1:row instead of that point.
rand.tract.choice <- vapply(rand.tract.list, function(tract) {
	tract.rows <- rows.by.tract[[tract]]
	tract.rows[sample.int(length(tract.rows), 1)]
}, integer(1), USE.NAMES = FALSE)

# Cut the chosen rows into n.folds consecutive chunks of ceiling(n/n.folds)
# rows; the last chunk holds the remainder. Using a factor with fixed levels
# guarantees the list always has n.folds elements (some possibly empty).
chunk.size <- ceiling(n.tracts / n.folds)
chunk.id <- factor(ceiling(seq_len(n.tracts) / chunk.size), levels = seq_len(n.folds))
tract.sample <- unname(split(rand.tract.choice, chunk.id))
rm(rows.by.tract, chunk.size, chunk.id)

#now run the loop, with the forecasting sample referencing tract.sample
# For each replicate k = 51..100:
#   * draw a 5% random subset (without replacement) of the rows of `combined`
#     and write it to bootstrap/train<k>.csv;
#   * take the rows of `covariates` listed in tract.sample[[k - 50]] and write
#     them to bootstrap/test<k>.csv.

# Make sure the output directory exists; write.csv() cannot create it and
# would otherwise fail only after the sampling work has been done.
dir.create('bootstrap', showWarnings = FALSE)

for (k in 51:100) {
	#reset collinearity checker
	coll.check <- TRUE

	#while loop to create suitable subset
	# A subset is accepted only if the check regression below is full rank.
	# lm() reports aliased (perfectly collinear) terms as NA coefficients, so
	# any NA means this draw is redrawn. `l` counts the attempts and is
	# printed after each one as a progress indicator.
	l <- 0
	while (coll.check) {
		#create subset of original data
		five.percent <- sample.int(nrow(combined), round(.05 * nrow(combined)))
		subset.data <- combined[five.percent, ]

		# The formula keeps the `subset.data$...` spelling on purpose: `$`
		# partially matches column names, so switching to bare names with
		# `data =` is only safe once the exact column names are confirmed.
		test.mod <- lm(subset.data$ideology~subset.data$age*subset.data$educ+as.factor(subset.data$race)*subset.data$female+
			subset.data$inc14+subset.data$cathOrth+subset.data$consRelig+subset.data$musJew+
			subset.data$mainline+subset.data$rural+subset.data$ownership+as.factor(subset.data$empstat)+
			subset.data$eastings*subset.data$northings+I(subset.data$eastings^2)+I(subset.data$northings^2))
		coll.check <- anyNA(coef(test.mod))
		l <- l + 1
		print(l)
	}

	write.csv(subset.data, file.path('bootstrap', paste0('train', k, '.csv')), row.names = FALSE)
	rm(subset.data)

	#call tract subset
	# Replicate k uses the (k - 50)-th group of pre-drawn tract rows.
	subset.covariates <- covariates[tract.sample[[k - 50]], ]
	write.csv(subset.covariates, file.path('bootstrap', paste0('test', k, '.csv')), row.names = FALSE)
	rm(subset.covariates)
}
